import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=False)
import plotly.express as px
# Big Mart sales dataset, hosted as a Google Sheets CSV export.
DATA_URL = ("https://docs.google.com/spreadsheets/d/"
            "17_svn8lKuMPh4sl01a8Fca656yLRwbkYD2osTgmrvi8/export?format=csv")
df = pd.read_csv(DATA_URL)
ml_df = df.copy()  # Untouched copy of the DataFrame reserved for the Machine Learning section
df.head()
| Item_Identifier | Item_Weight | Item_Fat_Content | Item_Visibility | Item_Type | Item_MRP | Outlet_Identifier | Outlet_Establishment_Year | Outlet_Size | Outlet_Location_Type | Outlet_Type | Item_Outlet_Sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | FDA15 | 9.30 | Low Fat | 0.016047 | Dairy | 249.8092 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 3735.1380 |
| 1 | DRC01 | 5.92 | Regular | 0.019278 | Soft Drinks | 48.2692 | OUT018 | 2009 | Medium | Tier 3 | Supermarket Type2 | 443.4228 |
| 2 | FDN15 | 17.50 | Low Fat | 0.016760 | Meat | 141.6180 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 2097.2700 |
| 3 | FDX07 | 19.20 | Regular | 0.000000 | Fruits and Vegetables | 182.0950 | OUT010 | 1998 | NaN | Tier 3 | Grocery Store | 732.3800 |
| 4 | NCD19 | 8.93 | Low Fat | 0.000000 | Household | 53.8614 | OUT013 | 1987 | High | Tier 3 | Supermarket Type1 | 994.7052 |
# Download the Plotly "orca" static-image export binary and install the
# headless-display dependencies (xvfb + GTK libs) it needs, so Plotly
# figures can be exported as PNGs from this notebook environment.
!wget https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage -O /usr/local/bin/orca
!chmod +x /usr/local/bin/orca
!apt-get install xvfb libgtk2.0-0 libgconf-2-4
## Thanks to Greg Hogg https://youtu.be/qNF1HqBvpGE for this solution to Plotly export issues
--2022-06-24 11:57:17-- https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage Resolving github.com (github.com)... 140.82.112.4 Connecting to github.com (github.com)|140.82.112.4|:443... connected. HTTP request sent, awaiting response... 302 Found Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/99037241/9dc3a580-286a-11e9-8a21-4312b7c8a512?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20220624%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220624T115717Z&X-Amz-Expires=300&X-Amz-Signature=14691362969b1efae950ffabbc65a1f11cfbb4d30a5e5074ca67d48d98fbd4a1&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=99037241&response-content-disposition=attachment%3B%20filename%3Dorca-1.2.1-x86_64.AppImage&response-content-type=application%2Foctet-stream [following] --2022-06-24 11:57:17-- https://objects.githubusercontent.com/github-production-release-asset-2e65be/99037241/9dc3a580-286a-11e9-8a21-4312b7c8a512?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20220624%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220624T115717Z&X-Amz-Expires=300&X-Amz-Signature=14691362969b1efae950ffabbc65a1f11cfbb4d30a5e5074ca67d48d98fbd4a1&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=99037241&response-content-disposition=attachment%3B%20filename%3Dorca-1.2.1-x86_64.AppImage&response-content-type=application%2Foctet-stream Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ... Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.108.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 51607939 (49M) [application/octet-stream] Saving to: ‘/usr/local/bin/orca’ /usr/local/bin/orca 100%[===================>] 49.22M 41.4MB/s in 1.2s 2022-06-24 11:57:18 (41.4 MB/s) - ‘/usr/local/bin/orca’ saved [51607939/51607939] Reading package lists... 
Done Building dependency tree Reading state information... Done The following package was automatically installed and is no longer required: libnvidia-common-460 Use 'apt autoremove' to remove it. The following additional packages will be installed: gconf-service gconf-service-backend gconf2-common libdbus-glib-1-2 libgail-common libgail18 libgtk2.0-bin libgtk2.0-common Suggested packages: gvfs The following NEW packages will be installed: gconf-service gconf-service-backend gconf2-common libdbus-glib-1-2 libgail-common libgail18 libgconf-2-4 libgtk2.0-0 libgtk2.0-bin libgtk2.0-common xvfb 0 upgraded, 11 newly installed, 0 to remove and 49 not upgraded. Need to get 3,715 kB of archives. After this operation, 17.2 MB of additional disk space will be used. Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libdbus-glib-1-2 amd64 0.110-2 [58.3 kB] Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 gconf2-common all 3.2.6-4ubuntu1 [700 kB] Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libgconf-2-4 amd64 3.2.6-4ubuntu1 [84.8 kB] Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 gconf-service-backend amd64 3.2.6-4ubuntu1 [58.1 kB] Get:5 http://archive.ubuntu.com/ubuntu bionic/universe amd64 gconf-service amd64 3.2.6-4ubuntu1 [2,036 B] Get:6 http://archive.ubuntu.com/ubuntu bionic/main amd64 libgtk2.0-common all 2.24.32-1ubuntu1 [125 kB] Get:7 http://archive.ubuntu.com/ubuntu bionic/main amd64 libgtk2.0-0 amd64 2.24.32-1ubuntu1 [1,769 kB] Get:8 http://archive.ubuntu.com/ubuntu bionic/main amd64 libgail18 amd64 2.24.32-1ubuntu1 [14.2 kB] Get:9 http://archive.ubuntu.com/ubuntu bionic/main amd64 libgail-common amd64 2.24.32-1ubuntu1 [112 kB] Get:10 http://archive.ubuntu.com/ubuntu bionic/main amd64 libgtk2.0-bin amd64 2.24.32-1ubuntu1 [7,536 B] Get:11 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.10 [784 kB] Fetched 3,715 kB in 1s (3,154 kB/s) Selecting previously unselected package 
libdbus-glib-1-2:amd64. (Reading database ... 155639 files and directories currently installed.) Preparing to unpack .../00-libdbus-glib-1-2_0.110-2_amd64.deb ... Unpacking libdbus-glib-1-2:amd64 (0.110-2) ... Selecting previously unselected package gconf2-common. Preparing to unpack .../01-gconf2-common_3.2.6-4ubuntu1_all.deb ... Unpacking gconf2-common (3.2.6-4ubuntu1) ... Selecting previously unselected package libgconf-2-4:amd64. Preparing to unpack .../02-libgconf-2-4_3.2.6-4ubuntu1_amd64.deb ... Unpacking libgconf-2-4:amd64 (3.2.6-4ubuntu1) ... Selecting previously unselected package gconf-service-backend. Preparing to unpack .../03-gconf-service-backend_3.2.6-4ubuntu1_amd64.deb ... Unpacking gconf-service-backend (3.2.6-4ubuntu1) ... Selecting previously unselected package gconf-service. Preparing to unpack .../04-gconf-service_3.2.6-4ubuntu1_amd64.deb ... Unpacking gconf-service (3.2.6-4ubuntu1) ... Selecting previously unselected package libgtk2.0-common. Preparing to unpack .../05-libgtk2.0-common_2.24.32-1ubuntu1_all.deb ... Unpacking libgtk2.0-common (2.24.32-1ubuntu1) ... Selecting previously unselected package libgtk2.0-0:amd64. Preparing to unpack .../06-libgtk2.0-0_2.24.32-1ubuntu1_amd64.deb ... Unpacking libgtk2.0-0:amd64 (2.24.32-1ubuntu1) ... Selecting previously unselected package libgail18:amd64. Preparing to unpack .../07-libgail18_2.24.32-1ubuntu1_amd64.deb ... Unpacking libgail18:amd64 (2.24.32-1ubuntu1) ... Selecting previously unselected package libgail-common:amd64. Preparing to unpack .../08-libgail-common_2.24.32-1ubuntu1_amd64.deb ... Unpacking libgail-common:amd64 (2.24.32-1ubuntu1) ... Selecting previously unselected package libgtk2.0-bin. Preparing to unpack .../09-libgtk2.0-bin_2.24.32-1ubuntu1_amd64.deb ... Unpacking libgtk2.0-bin (2.24.32-1ubuntu1) ... Selecting previously unselected package xvfb. Preparing to unpack .../10-xvfb_2%3a1.19.6-1ubuntu4.10_amd64.deb ... Unpacking xvfb (2:1.19.6-1ubuntu4.10) ... 
Setting up gconf2-common (3.2.6-4ubuntu1) ... Creating config file /etc/gconf/2/path with new version Setting up libgtk2.0-common (2.24.32-1ubuntu1) ... Setting up libdbus-glib-1-2:amd64 (0.110-2) ... Setting up xvfb (2:1.19.6-1ubuntu4.10) ... Setting up libgconf-2-4:amd64 (3.2.6-4ubuntu1) ... Setting up libgtk2.0-0:amd64 (2.24.32-1ubuntu1) ... Setting up libgail18:amd64 (2.24.32-1ubuntu1) ... Setting up libgail-common:amd64 (2.24.32-1ubuntu1) ... Setting up libgtk2.0-bin (2.24.32-1ubuntu1) ... Setting up gconf-service-backend (3.2.6-4ubuntu1) ... Setting up gconf-service (3.2.6-4ubuntu1) ... Processing triggers for libc-bin (2.27-3ubuntu1.3) ... /sbin/ldconfig.real: /usr/local/lib/python3.7/dist-packages/ideep4py/lib/libmkldnn.so.0 is not a symbolic link Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
# Column dtypes and non-null counts; Item_Weight and Outlet_Size have missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8523 entries, 0 to 8522 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Item_Identifier 8523 non-null object 1 Item_Weight 7060 non-null float64 2 Item_Fat_Content 8523 non-null object 3 Item_Visibility 8523 non-null float64 4 Item_Type 8523 non-null object 5 Item_MRP 8523 non-null float64 6 Outlet_Identifier 8523 non-null object 7 Outlet_Establishment_Year 8523 non-null int64 8 Outlet_Size 6113 non-null object 9 Outlet_Location_Type 8523 non-null object 10 Outlet_Type 8523 non-null object 11 Item_Outlet_Sales 8523 non-null float64 dtypes: float64(4), int64(1), object(7) memory usage: 799.2+ KB
# Count fully duplicated rows.
df.duplicated().sum()
0
# Missing-value count per column.
df.isna().sum()
Item_Identifier 0 Item_Weight 1463 Item_Fat_Content 0 Item_Visibility 0 Item_Type 0 Item_MRP 0 Outlet_Identifier 0 Outlet_Establishment_Year 0 Outlet_Size 2410 Outlet_Location_Type 0 Outlet_Type 0 Item_Outlet_Sales 0 dtype: int64
# Impute missing Item_Weight with the mean weight of the item's category.
# A single groupby-transform replaces the original per-category Python loop
# (one vectorized pass instead of one boolean scan per Item_Type).
df["Item_Weight"] = df["Item_Weight"].fillna(
    df.groupby("Item_Type")["Item_Weight"].transform("mean"))
df.isna().sum()
Item_Identifier 0 Item_Weight 0 Item_Fat_Content 0 Item_Visibility 0 Item_Type 0 Item_MRP 0 Outlet_Identifier 0 Outlet_Establishment_Year 0 Outlet_Size 2410 Outlet_Location_Type 0 Outlet_Type 0 Item_Outlet_Sales 0 dtype: int64
# Distinct Outlet_Size values (includes NaN).
df["Outlet_Size"].unique()
array(['Medium', nan, 'High', 'Small'], dtype=object)
# Per-outlet non-null counts: Outlet_Size is missing for entire outlets
# (OUT010, OUT017, OUT045), so it cannot be recovered from sibling rows.
df.groupby("Outlet_Identifier")[["Outlet_Size","Outlet_Location_Type"]].count()
| Outlet_Size | Outlet_Location_Type | |
|---|---|---|
| Outlet_Identifier | ||
| OUT010 | 0 | 555 |
| OUT013 | 932 | 932 |
| OUT017 | 0 | 926 |
| OUT018 | 928 | 928 |
| OUT019 | 528 | 528 |
| OUT027 | 935 | 935 |
| OUT035 | 930 | 930 |
| OUT045 | 0 | 929 |
| OUT046 | 930 | 930 |
| OUT049 | 930 | 930 |
# Outlet_Size is missing for whole outlets (see the per-outlet counts above),
# so keep those rows under an explicit "Unknown" category instead of dropping.
# Assigning the result back replaces Series.fillna(inplace=True) on a selected
# column, which pandas deprecates (chained assignment / copy-on-write unsafe).
df["Outlet_Size"] = df["Outlet_Size"].fillna("Unknown")
df.isna().sum()
Item_Identifier 0 Item_Weight 0 Item_Fat_Content 0 Item_Visibility 0 Item_Type 0 Item_MRP 0 Outlet_Identifier 0 Outlet_Establishment_Year 0 Outlet_Size 0 Outlet_Location_Type 0 Outlet_Type 0 Item_Outlet_Sales 0 dtype: int64
# Rename the "High" size label to "Large" for readability.
df["Outlet_Size"] = df["Outlet_Size"].replace("High", "Large")
df["Outlet_Size"].unique()
array(['Medium', 'Unknown', 'Large', 'Small'], dtype=object)
# Cardinality of each column.
df.nunique()
Item_Identifier 1559 Item_Weight 431 Item_Fat_Content 5 Item_Visibility 7880 Item_Type 16 Item_MRP 5938 Outlet_Identifier 10 Outlet_Establishment_Year 9 Outlet_Size 4 Outlet_Location_Type 3 Outlet_Type 4 Item_Outlet_Sales 3493 dtype: int64
# Item_Fat_Content holds inconsistent spellings of the same two categories.
df["Item_Fat_Content"].unique()
array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)
# Normalize the inconsistent Item_Fat_Content spellings down to the two real
# categories. One dict-based replace with assignment back replaces the two
# replace(..., inplace=True) calls on a selected column, a chained-assignment
# pattern pandas deprecates (it may stop propagating under copy-on-write).
df["Item_Fat_Content"] = df["Item_Fat_Content"].replace(
    {"low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular"})
df.nunique()
Item_Identifier 1559 Item_Weight 431 Item_Fat_Content 2 Item_Visibility 7880 Item_Type 16 Item_MRP 5938 Outlet_Identifier 10 Outlet_Establishment_Year 9 Outlet_Size 4 Outlet_Location_Type 3 Outlet_Type 4 Item_Outlet_Sales 3493 dtype: int64
# The 16 distinct product categories.
df["Item_Type"].unique()
array(['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables',
'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
'Breads', 'Starchy Foods', 'Others', 'Seafood'], dtype=object)
# Summary statistics for the numeric columns.
df.describe()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Item_Outlet_Sales | |
|---|---|---|---|---|---|
| count | 8523.000000 | 8523.000000 | 8523.000000 | 8523.000000 | 8523.000000 |
| mean | 12.857890 | 0.066132 | 140.992782 | 1997.831867 | 2181.288914 |
| std | 4.232804 | 0.051598 | 62.275067 | 8.371760 | 1706.499616 |
| min | 4.555000 | 0.000000 | 31.290000 | 1985.000000 | 33.290000 |
| 25% | 9.310000 | 0.026989 | 93.826500 | 1987.000000 | 834.247400 |
| 50% | 12.867061 | 0.053931 | 143.012800 | 1999.000000 | 1794.331000 |
| 75% | 16.000000 | 0.094585 | 185.643700 | 2004.000000 | 3101.296400 |
| max | 21.350000 | 0.328391 | 266.888400 | 2009.000000 | 13086.964800 |
# Quick look at the overall sales distribution.
fig, ax = plt.subplots()
ax.boxplot(
    [df["Item_Outlet_Sales"]],
    labels=["Item Sales"],
    notch=True,
    patch_artist=True,
)
plt.show()
# Flag sales outliers with the standard two-sided Tukey rule: anything
# outside [q1 - 1.5*IQR, q3 + 1.5*IQR]. On this data the lower fence is
# negative (sales are strictly positive), so only high-sales rows match —
# identical to the original upper-fence-only check, but correct in general.
sales = df['Item_Outlet_Sales']
q1 = sales.quantile(0.25)
q3 = sales.quantile(0.75)
iqr = q3 - q1
outliers = df[(sales < q1 - iqr * 1.5) | (sales > q3 + iqr * 1.5)]
outliers
| Item_Identifier | Item_Weight | Item_Fat_Content | Item_Visibility | Item_Type | Item_MRP | Outlet_Identifier | Outlet_Establishment_Year | Outlet_Size | Outlet_Location_Type | Outlet_Type | Item_Outlet_Sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 43 | FDC02 | 21.350000 | Low Fat | 0.069103 | Canned | 259.9278 | OUT018 | 2009 | Medium | Tier 3 | Supermarket Type2 | 6768.5228 |
| 130 | FDY25 | 12.305705 | Low Fat | 0.033810 | Canned | 180.5976 | OUT027 | 1985 | Medium | Tier 3 | Supermarket Type3 | 7968.2944 |
| 132 | NCR53 | 13.142314 | Low Fat | 0.144338 | Health and Hygiene | 224.4404 | OUT027 | 1985 | Medium | Tier 3 | Supermarket Type3 | 6976.2524 |
| 145 | FDP16 | 18.600000 | Low Fat | 0.039356 | Frozen Foods | 246.3802 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 7370.4060 |
| 203 | FDI24 | 12.277108 | Low Fat | 0.078362 | Baking Goods | 177.9370 | OUT027 | 1985 | Medium | Tier 3 | Supermarket Type3 | 6704.6060 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8245 | FDU55 | 16.200000 | Low Fat | 0.035967 | Fruits and Vegetables | 260.3278 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 7549.5062 |
| 8329 | NCQ06 | 13.384736 | Low Fat | 0.041622 | Household | 253.6014 | OUT027 | 1985 | Medium | Tier 3 | Supermarket Type3 | 6630.0364 |
| 8350 | NCE18 | 10.000000 | Low Fat | 0.021421 | Household | 248.3750 | OUT035 | 2004 | Small | Tier 2 | Supermarket Type1 | 7240.5750 |
| 8447 | FDS26 | 20.350000 | Low Fat | 0.089975 | Dairy | 261.6594 | OUT017 | 2007 | Unknown | Tier 2 | Supermarket Type1 | 7588.1226 |
| 8510 | FDN58 | 13.800000 | Regular | 0.056862 | Snack Foods | 231.5984 | OUT035 | 2004 | Small | Tier 2 | Supermarket Type1 | 7182.6504 |
186 rows × 12 columns
# Remove the high-sales outlier rows found above, then re-check the stats.
df.drop(index=outliers.index, inplace=True)
df.describe()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Item_Outlet_Sales | |
|---|---|---|---|---|---|
| count | 8337.000000 | 8337.000000 | 8337.000000 | 8337.000000 | 8337.000000 |
| mean | 12.856296 | 0.066358 | 139.161087 | 1997.968094 | 2059.362844 |
| std | 4.256788 | 0.051838 | 61.553351 | 8.309941 | 1506.099754 |
| min | 4.555000 | 0.000000 | 31.290000 | 1985.000000 | 33.290000 |
| 25% | 9.300000 | 0.027028 | 93.046200 | 1987.000000 | 810.944400 |
| 50% | 12.867061 | 0.053939 | 141.215400 | 1999.000000 | 1747.059200 |
| 75% | 16.100000 | 0.095299 | 183.695000 | 2004.000000 | 2998.097400 |
| max | 21.350000 | 0.328391 | 266.888400 | 2009.000000 | 6478.234000 |
# Correlation heatmap of the numeric columns. Selecting them explicitly keeps
# DataFrame.corr() working on newer pandas, where calling it on a mixed-dtype
# frame no longer silently drops the object columns (it raises instead).
sns.heatmap(df.select_dtypes(include="number").corr(), cmap="Blues", annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7ff4cbe41ad0>
# Sales revenue against shelf visibility, with an OLS trend line.
visibility = df['Item_Visibility']
revenue = df['Item_Outlet_Sales']
# Degree-1 polynomial fit = ordinary least-squares line (slope, intercept).
slope, intercept = np.polyfit(visibility, revenue, 1)
fig, ax = plt.subplots()
ax.scatter(visibility, revenue, label="datapoint")
ax.plot(visibility, slope * visibility + intercept, "r-", label="Best Fit Line")
ax.legend()
ax.set_title("Total Sales Revenue by Visibility")
# Typo fixed: the axis label previously read "Visiblity".
ax.set_xlabel("Visibility")
ax.set_ylabel("Sales Revenue")
plt.show()
# Overlaid sales histograms, one per Outlet_Size group.
histogram = df.groupby("Outlet_Size")["Item_Outlet_Sales"].hist(alpha=0.35,legend=True)
# Sales distribution per outlet size, small to large plus the unknown group.
size_labels = ["Small", "Medium", "Large", "Unknown"]
size_groups = [df.loc[df["Outlet_Size"] == size, "Item_Outlet_Sales"]
               for size in size_labels]
fig, ax = plt.subplots()
bps = ax.boxplot(
    size_groups,
    labels=size_labels,
    notch=True,
    patch_artist=True,
)
plt.show()
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
# Sales distribution summary per Outlet_Size group.
df.groupby("Outlet_Size")["Item_Outlet_Sales"].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Outlet_Size | ||||||||
| Large | 917.0 | 2217.100205 | 1398.677744 | 73.2380 | 1057.95620 | 2014.7108 | 3103.95960 | 6474.2392 |
| Medium | 2676.0 | 2458.690424 | 1530.464302 | 69.2432 | 1238.22155 | 2135.8864 | 3448.84400 | 6478.2340 |
| Small | 2362.0 | 1852.606258 | 1483.026265 | 33.9558 | 593.22780 | 1517.3582 | 2775.55375 | 6474.2392 |
| Unknown | 2382.0 | 1755.044328 | 1432.707736 | 33.2900 | 549.28500 | 1417.4882 | 2631.24160 | 6471.5760 |
# Total sales per outlet, colored/stacked by outlet size.
size_order = ["Small", "Medium", "Large", "Unknown"]
fig = px.histogram(
    df,
    x="Outlet_Identifier",
    y="Item_Outlet_Sales",
    title="Total Sales Per Outlet",
    hover_data=["Outlet_Size"],
    color="Outlet_Size",
    labels={
        "Outlet_Size": "Outlet Size",
        "Outlet_Identifier": "Outlet ID",
        "Item_Outlet_Sales": "Total Sales",
    },
    category_orders={"Outlet_Size": size_order},
)
py.iplot(fig)
# fig.show("png")
# Total sales per item category, colored/stacked by outlet type.
outlet_type_order = ["Supermarket Type1", "Supermarket Type2",
                     "Supermarket Type3", "Grocery Store"]
fig = px.histogram(
    df,
    x="Item_Type",
    y="Item_Outlet_Sales",
    title="Total Sales Per Item Type",
    hover_data=["Outlet_Type"],
    color="Outlet_Type",
    labels={
        "Outlet_Type": "Outlet Type",
        "Item_Type": "Item Type",
        "Item_Outlet_Sales": "Total Sales",
    },
    category_orders={"Outlet_Type": outlet_type_order},
)
py.iplot(fig)
# fig.show("png")
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
# Target: per-item sales at each outlet (from the untouched ML copy).
y = ml_df["Item_Outlet_Sales"]

# Continuous inputs, scaled later in the pipeline.
numeric_features = ["Item_Weight",
                    "Item_Visibility",
                    "Item_MRP",
                    "Outlet_Establishment_Year"]

# Categorical inputs, ordinal-encoded later in the pipeline.
ordinal_encoded_features = ["Outlet_Size",
                            "Outlet_Type",
                            "Outlet_Identifier",
                            "Item_Type",
                            "Item_Fat_Content"]

# All features fed to the pipeline; Item_Identifier is carried along only
# for weight imputation and is dropped by the column transformer.
X = ml_df[numeric_features + ordinal_encoded_features + ["Item_Identifier"]]
X.head()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Outlet_Size | Outlet_Type | Outlet_Identifier | Item_Type | Item_Fat_Content | Item_Identifier | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9.30 | 0.016047 | 249.8092 | 1999 | Medium | Supermarket Type1 | OUT049 | Dairy | Low Fat | FDA15 |
| 1 | 5.92 | 0.019278 | 48.2692 | 2009 | Medium | Supermarket Type2 | OUT018 | Soft Drinks | Regular | DRC01 |
| 2 | 17.50 | 0.016760 | 141.6180 | 1999 | Medium | Supermarket Type1 | OUT049 | Meat | Low Fat | FDN15 |
| 3 | 19.20 | 0.000000 | 182.0950 | 1998 | NaN | Grocery Store | OUT010 | Fruits and Vegetables | Regular | FDX07 |
| 4 | 8.93 | 0.000000 | 53.8614 | 1987 | High | Supermarket Type1 | OUT013 | Household | Low Fat | NCD19 |
# Hold out a validation set (default split fraction); fixed seed so the split
# is reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)
class WeightsByIdImputer(BaseEstimator, TransformerMixin):
    """Fill missing Item_Weight values using weights of the same product.

    The same Item_Identifier appears in many rows (one per outlet), so a
    missing weight can usually be recovered from another row of the same
    product. fit() memorizes the modal weight per identifier; transform()
    fills missing weights from that mapping and leaves identifiers never
    seen in fit() untouched (a later imputer handles those).
    """

    def __init__(self):
        # item_id -> modal Item_Weight, populated by fit()
        self.known_weights = {}

    def fit(self, X, y=None):
        # Rebuild from scratch so refitting does not keep stale mappings
        # from a previous fit (the original accumulated them forever).
        # Grouping once replaces the original per-identifier scan of X.
        observed = X.dropna(subset=["Item_Weight"])
        grouped = observed.groupby("Item_Identifier")["Item_Weight"]
        self.known_weights = grouped.agg(lambda s: s.mode()[0]).to_dict()
        return self

    def transform(self, X, y=None):
        X_copy = X.copy()
        missing = X_copy["Item_Weight"].isna()
        # map() yields NaN for identifiers absent from known_weights, which
        # leaves those entries missing — matching the original behavior.
        X_copy.loc[missing, "Item_Weight"] = \
            X_copy.loc[missing, "Item_Identifier"].map(self.known_weights)
        return X_copy
class WeightsByTypeImputer(BaseEstimator, TransformerMixin):
    """Fill missing Item_Weight values with the mean weight of the Item_Type.

    Intended to run after WeightsByIdImputer: weights that are still missing
    get the average weight of their product category, or the global average
    for categories never seen during fit().
    """

    def __init__(self):
        # item_type -> mean Item_Weight, plus a "default" global mean
        self.averages = {}

    def fit(self, X, y=None):
        # Rebuild from scratch so refitting does not keep stale averages
        # from a previous fit (the original accumulated them forever).
        self.averages = {"default": X["Item_Weight"].mean()}
        self.averages.update(
            X.groupby("Item_Type")["Item_Weight"].mean().to_dict())
        return self

    def transform(self, X, y=None):
        X_copy = X.copy()
        missing = X_copy["Item_Weight"].isna()
        default = self.averages["default"]
        # Per-type mean where known, global mean for unseen categories.
        fill_values = X_copy.loc[missing, "Item_Type"].map(
            lambda item_type: self.averages.get(item_type, default))
        X_copy.loc[missing, "Item_Weight"] = fill_values
        return X_copy
# Preprocessing: scale the numeric columns; impute-then-ordinal-encode the
# categorical ones. Item_Identifier is dropped by the column transformer
# (it is only needed by the weight imputers upstream).
scaler = StandardScaler()
missing_imputer = SimpleImputer(strategy="constant", fill_value="Unknown")
ord_enc = OrdinalEncoder()
cat_pipe = make_pipeline(missing_imputer, ord_enc)
transformer = make_column_transformer(
    (scaler, numeric_features),
    (cat_pipe, ordinal_encoded_features),
)

# Impute weights by product id first, then by product category for the rest.
known_weight_imputer = WeightsByIdImputer()
type_weight_imputer = WeightsByTypeImputer()
pipe = make_pipeline(known_weight_imputer, type_weight_imputer, transformer)
pipe.fit(X_train, y_train)

output_columns = numeric_features + ordinal_encoded_features
X_train = pd.DataFrame(pipe.transform(X_train), columns=output_columns)
X_valid = pd.DataFrame(pipe.transform(X_valid), columns=output_columns)
X_train.head()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Outlet_Size | Outlet_Type | Outlet_Identifier | Item_Type | Item_Fat_Content | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.247802 | 0.858650 | 0.486773 | 0.499522 | 3.0 | 1.0 | 7.0 | 6.0 | 1.0 |
| 1 | -0.065395 | 0.157762 | 1.838334 | 1.095443 | 3.0 | 1.0 | 2.0 | 4.0 | 2.0 |
| 2 | -1.050018 | 0.881784 | 0.207711 | 0.737890 | 2.0 | 1.0 | 6.0 | 13.0 | 1.0 |
| 3 | 0.601052 | 1.096025 | -0.134189 | -1.288241 | 0.0 | 1.0 | 1.0 | 3.0 | 1.0 |
| 4 | -0.237382 | -1.126598 | -0.391272 | 0.737890 | 2.0 | 1.0 | 6.0 | 8.0 | 1.0 |
# Baseline model: ordinary least-squares linear regression.
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)
LinearRegression()
def get_MAE(model,X,y):
    """Mean absolute error of model's predictions on (X, y)."""
    predictions = model.predict(X)
    return mean_absolute_error(y, predictions)
def get_percent(model,X,y):
    """Crude 'accuracy': one minus the MAE relative to the mean target."""
    relative_error = get_MAE(model, X, y) / y.mean()
    return 1 - relative_error
def get_MSE(model,X,y):
    """Mean squared error of model's predictions on (X, y)."""
    predictions = model.predict(X)
    return mean_squared_error(y, predictions)
def get_RMSE(model,X,y):
    """Root mean squared error, derived from get_MSE."""
    mse = get_MSE(model, X, y)
    return np.sqrt(mse)
def get_r2(model,X,y):
    """Coefficient of determination (R^2) of model on (X, y)."""
    predictions = model.predict(X)
    return r2_score(y, predictions)
def print_stats(model,X,y):
    """Print the standard evaluation metrics for model on (X, y)."""
    print("The MAE for this model is {:.2f}".format(get_MAE(model,X,y)))
    print("The MSE for this model is {:.2f}".format(get_MSE(model,X,y)))
    # Label fixed: previously printed "RMAE" although the value is the RMSE.
    print("The RMSE for this model is {:.2f}".format(get_RMSE(model,X,y)))
    print("The r2 for this model is {:.2f}".format(get_r2(model,X,y)))
    print("The accuracy (1 - MAE / y.mean ) for this model is {:%}".format(get_percent(model,X,y)))
# Evaluate the linear baseline on the held-out validation set.
print_stats(lin_reg,X_valid,y_valid)
The MAE for this model is 918.55 The MSE for this model is 1457870.80 The RMAE for this model is 1207.42 The r2 for this model is 0.50 The accuracy (1 - MAE / y.mean ) for this model is 57.475388%
# Random forest with default hyperparameters as a stronger baseline.
model = RandomForestRegressor(random_state=1)
model.fit(X_train,y_train)
print_stats(model,X_valid,y_valid)
The MAE for this model is 794.56 The MSE for this model is 1291149.74 The RMAE for this model is 1136.29 The r2 for this model is 0.55 The accuracy (1 - MAE / y.mean ) for this model is 63.215830%
# Sweep the forest size from 50 to 300 trees, recording validation MAE.
results = {}
for n_trees in (50, 100, 150, 200, 250, 300):
    test_model = RandomForestRegressor(n_estimators=n_trees, random_state=1)
    test_model.fit(X_train, y_train)
    results[n_trees] = get_MAE(test_model, X_valid, y_valid)
    print(f"{n_trees}:")
    print_stats(test_model, X_valid, y_valid)
50: The MAE for this model is 796.24 The MSE for this model is 1299342.97 The RMAE for this model is 1139.89 The r2 for this model is 0.55 The accuracy (1 - MAE / y.mean ) for this model is 63.137704% 100: The MAE for this model is 794.56 The MSE for this model is 1291149.74 The RMAE for this model is 1136.29 The r2 for this model is 0.55 The accuracy (1 - MAE / y.mean ) for this model is 63.215830% 150: The MAE for this model is 793.81 The MSE for this model is 1288509.92 The RMAE for this model is 1135.13 The r2 for this model is 0.56 The accuracy (1 - MAE / y.mean ) for this model is 63.250524% 200: The MAE for this model is 791.49 The MSE for this model is 1282953.99 The RMAE for this model is 1132.68 The r2 for this model is 0.56 The accuracy (1 - MAE / y.mean ) for this model is 63.357740% 250: The MAE for this model is 792.05 The MSE for this model is 1285482.33 The RMAE for this model is 1133.79 The r2 for this model is 0.56 The accuracy (1 - MAE / y.mean ) for this model is 63.331717% 300: The MAE for this model is 790.93 The MSE for this model is 1280898.93 The RMAE for this model is 1131.77 The r2 for this model is 0.56 The accuracy (1 - MAE / y.mean ) for this model is 63.383495%
# MAE by forest size; y-axis inverted so "better" points upward.
plt.plot(list(results.keys()), list(results.values()))
plt.gca().invert_yaxis()
plt.show()
from xgboost.sklearn import XGBRegressor
# Gradient-boosted trees: up to 2000 boosting rounds at a small learning
# rate; early stopping on the validation set halts once 10 rounds pass
# without improvement, picking the effective tree count automatically.
model2 = XGBRegressor(n_estimators = 2000,
                      learning_rate=0.005,
                      early_stopping_rounds = 10,
                      random_state= 1)
model2.fit(X_train,y_train,
           eval_set= [(X_valid, y_valid)],
           verbose = False)
print_stats(model2,X_valid,y_valid)
[11:58:33] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror. The MAE for this model is 759.09 The MSE for this model is 1163752.02 The RMAE for this model is 1078.77 The r2 for this model is 0.60 The accuracy (1 - MAE / y.mean ) for this model is 64.857953%
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
# Fully-connected regression network with batch-norm and dropout.
model3 = keras.Sequential([
    layers.BatchNormalization(input_shape=[X_train.shape[1]]),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(1),
])

# Stop once validation loss hasn't improved by >= 0.001 for 20 epochs and
# roll back to the best weights seen.
early_stopping = callbacks.EarlyStopping(
    min_delta=0.001,
    patience=20,
    restore_best_weights=True,
)

model3.compile(
    optimizer="adam",
    loss="mse",
    metrics=['mae'],
)

history = model3.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=128,
    callbacks=[early_stopping],
    epochs=100,
    verbose=0,
)

history_df = pd.DataFrame(history.history)
history_df['val_mae'].plot()
best_results = history_df['val_mae'].min()
# Label fixed: this value is the validation MAE, not the (MSE) loss.
print("Minimum Validation MAE: {:0.4f}".format(best_results))
# Denominator fixed: use the validation targets' mean so this "accuracy" is
# directly comparable with get_percent() used for the other models. The old
# code divided by df's mean, but df has outliers removed while this model is
# trained and validated on the untrimmed ml_df-derived targets.
print("Highest Accuracy: {:0.4%}".format(1 - (best_results / y_valid.mean())))
Minimum Validation Loss: 750.5770 Highest Accuracy: 63.5530%
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid: 4 x 3 x 3 = 36 candidates, 5 CV folds = 180 fits.
param_grid = {
    "n_estimators": list(range(100, 251, 50)),
    "max_features": [1.0, "sqrt", "log2"],
    "min_weight_fraction_leaf": [0.0, 0.1, 0.01],
}
# random_state=1 added for consistency with every other RandomForest run in
# this notebook, so the search result is reproducible.
grid_search = GridSearchCV(estimator = RandomForestRegressor(random_state=1),
                           param_grid = param_grid,
                           scoring = "neg_mean_absolute_error",
                           n_jobs = 4,
                           verbose = 2)
grid_search.fit(X_train, y_train)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
GridSearchCV(estimator=RandomForestRegressor(), n_jobs=4,
param_grid={'max_features': [1.0, 'sqrt', 'log2'],
'min_weight_fraction_leaf': [0.0, 0.1, 0.01],
'n_estimators': [100, 150, 200, 250]},
scoring='neg_mean_absolute_error', verbose=2)
# Validation metrics for the best estimator found by the grid search,
# followed by the winning hyperparameters.
print_stats(grid_search,X_valid,y_valid)
grid_search.best_params_
The MAE for this model is 760.47 The MSE for this model is 1168559.49 The RMAE for this model is 1081.00 The r2 for this model is 0.60 The accuracy (1 - MAE / y.mean ) for this model is 64.793769%
{'max_features': 1.0, 'min_weight_fraction_leaf': 0.01, 'n_estimators': 250}